/* 
Data prep for state-level obs estimates
(1) read in dams at admin1 level
(2) starts with data at .05 degree level and aggregate to .5 degree to merge to coarser grid data
(3) aggregrate .5 degree grid data to administrative level 1

 follows:
* read_lights_raster.do (remote sensed droughts and lights), 
* read_ascii_cru.do (reads CRU sc-PDSI, temp and precip)
* read_other_rasters.do (to read in cross-sectional rasters) 
* read_pfaf4char.do (creates pfaf4_char.dta) --- dams and other pfaf4 characteristics

preceeds reg_nocell.do
*/
capture log close
log using merge_admin1, replace
timer on 1

* reads in data on dams by admin1 level (state)
* data from ArcGIS "spatial join" of GRanD to GADM level 1 
* target is admin1 and join feature are dams
* use "contains" and "one-to-many" and "keep all target features"
import delimited "../data/dam_admin1_join.csv", clear

**Generate dam counts

*first, gen separate vars for diff dam types
*will =0 for for missing uses
gen irrig=(use_irr=="Main" | use_irr=="Major")
gen hydro=(use_elec=="Main" | use_elec=="Major")
gen supply=(use_supp=="Main" | use_supp=="Major")
gen flood=(use_fcon=="Main" | use_fcon=="Major")


*clean/prep grand variables for collapsing by means/sums
*grand missing numerics are coded to -99
replace cap_mcm=. if cap_mcm<0
replace cap_rep=. if cap_rep<0
replace area_skm=. if area_skm<0
replace dor_pc=. if dor_pc<0
replace year=. if year<0
replace dam_hgt_m=. if dam_hgt_m<0
*6.88% of dams (472) missing height, only 8 dams missing cap_mcm 

*only dams with capacity>=.1km3 are necessarily included in GRanD 
* variable to restrict attention to these
* capacity in million m3 
gen largedam=(cap_mcm>=100) & !missing(cap_mcm)
gen largeirsuhy=largedam*(irrig | supply | hydro)
gen irsuhy=(irrig | supply | hydro)

*reservoir capacity for only hydro dams
gen cap_mcm_hydro=cap_mcm*hydro

** identifiers for country
rename target_fid admin1
rename gid_1 admin1id
rename gid_0 countryid
rename shape_area admin1area

label var admin1 "Admin1 # (corresponds to grid)"
label var admin1id "Admin1 code"

#delimit ;

collapse (sum) count_irr=irrig count_flood=flood count_supply=supply count_hydro=hydro 
count_large=largedam count_largeirsuhy=largeirsuhy count_irsuhy=irsuhy rescap=cap_mcm rescaphydro=cap_mcm_hydro resarea=area_skm height=dam_hgt_m (count) count_dams=grand_id , by(admin1 admin1id countryid admin1area) ;

#delimit cr 


gen ifdam=count_dams>0
gen ifhydro=count_hydro>0

drop if missing(admin1)

sort admin1

tempfile dams_admin1
save `dams_admin1', replace


/*
*need population density, not at pfaf4 level as elsewhere, so read in Gridded Population of the World a
*source is https://data.earthdata.nasa.gov/nasa-earth/human-dimensions/sedac-root/downloads/data/gpw-v4/gpw-v4-population-density-rev11/gpw-v4-population-density-rev11_2000_30_min_asc.zip
*/
clear
cd "../data"
ras2dta, file(gpw_popdens2000) genxcoord(x) genycoord(y) replace
* lights and droughts from 75 degrees N to -60 S so drop rest
drop if y<31 | y>300
replace y=y-30
rename gpw_popdens2000 pop2000
label var pop2000 "People per sq km in 2000 (grid cell)"
sort y x
tempfile popgrid 
save `popgrid', replace
erase gpw_popdens2000.dta



** start with .05 degree data
use grid_lights_drought, clear
sort x y year

* merge cross-sectional data that's at .05 degree cell level
merge m:1 x y using flares
drop if _merge==2
gen flare=(detection_frequency_2012>0) & _merge==3
drop _merge

summarize lights if flare==1, detail
replace lights=. if flare
replace ihslights=. if flare


*drop flare cells in the same manner as in cell-level data
gen x_10=floor((x-1)/10) + 1
gen y_10=floor((y-1)/10) + 1 
bys x_10 y_10: egen flaremin=min(flare)
replace lights=. if flaremin>0
replace ihslights=. if flaremin>0

merge m:1 x y using admin1
summarize x y if _merge==2, detail
drop _merge


*need to keep merging pfaf4 just to get continent ids
merge m:1 x y using pfaf4_grid
drop _merge
gen continent=int(conpfaf4/10000)



* variables to reflect frequency of various levels of drought by admin1

generate dsi_cat_temp=recode(drought,-1.5,-1.2,-.9,-.6,-.3,.3,.6,.9,1.2,1.5,4) if !missing(drought)
egen byte dsi_cat11=group(dsi_cat_temp) if !missing(drought)
drop dsi_cat_temp

gen esmfreq=(dsi_cat11<=3)

tabulate dsi_cat11, gen(dsi_cat_freq)




collapse (mean) lights ihslights drought esmfreq dsi_cat_freq* (count) cellsinadmin1=lights (firstnm) continent, by(admin1 year)

label var cellsinadmin1 "Num 0.05 degree cells in admin 1"

tempfile fine

save `fine', replace

* now get .5 degreee data
use  admin1, clear

gen x_10=floor((x-1)/10) + 1
gen y_10=floor((y-1)/10) + 1 

*pick county codes that is most common in .5 degree grid cell
bys x_10 y_10: egen admin1mode=mode(admin1), maxmode 

collapse (firstnm) admin1=admin1mode, by(x_10 y_10)


rename x_10 x
rename y_10 y

sort x y


* add temp data at half degree level
merge 1:m x y using annual_av_tmp
drop if _merge==2
drop _merge


* use WB definition of center of cells
gen lon=-179.75+.5*(x-1)
gen lat=74.75-.5*(y-1)


label var lon "Logitude (midpoint)"
label var lat "Latitude (midpoint)"


sort y x year

** merge sc-PDSI data
merge 1:1 y x year using pdsi, sorted
drop if _merge==2
drop _merge


merge m:1 y x using `popgrid'
drop if _merge==2
drop _merge


sort lon lat
merge m:1 lon lat using aqtyp_gwresource_grid05deg
drop if _merge==2
drop _merge


*** now collapse to admin level

sort admin1 year


generate pdsi_cat_temp=recode(pdsi_av,-4,-3,-2,-1,-.5,.5,1,2,3,4,30)
egen byte pdsi_cat=group(pdsi_cat_temp)


gen esmpfreq=(pdsi_cat<=3)
tabulate pdsi_cat, gen(pdsi_cat_freq)

drop pdsi_cat_temp pdsi_cat

collapse (mean) pdsi* *tmp aqtyp_pct* resource* pop2000, by(admin1 year)

** combine with more disaggreagated data

merge 1:1 admin1 year using `fine'
drop if _merge==1
drop _merge

* add dams data from spatial join
merge m:1 admin1 using `dams_admin1'
drop _merge


*create information on lights by pfaf4 over time so can exclude unpopulated places
bysort admin1: egen minlights=min(lights)
 
label var minlights "Min (over time) lights for state" 

drop if minlights==0 | missing(minlights)


** all labels after collapse


label var lights "Light index"
label var drought "Remote-sensed DSI"
label var tmp "Annual av temp"
label var dev_tmp "Temp anomaly"


*11 bins
generate dsi_cat_temp=recode(drought,-1.5,-1.2,-.9,-.6,-.3,.3,.6,.9,1.2,1.5,4)
egen byte dsi_cat11=group(dsi_cat_temp)
drop dsi_cat_temp
*7 bins
gen dsi_cat7=dsi_cat11
replace dsi_cat7=7 if dsi_cat11>=7


label define cat11 1 "Extreme drought"  2 "Severe drought" 3 "Moderate drought" 4 "Mild drought" ///
5 "Incipient drought" 6 "Near normal" 7 "Incipient wet spell"  8 "Slightly wet" 9 "Moderately wet" ///
10 "Very wet"  11 "Extremely wet"
label values dsi_cat11 cat11
label define cat7 1 "Extreme drought"  2 "Severe drought" 3 "Moderate drought" 4 "Mild drought" ///
5 "Incipient drought" 6 "Near normal" 7 "Wet"  
label values dsi_cat7 cat7
label var dsi_cat11 "DSI, 11 categories"
label var dsi_cat7 "DSI, 7 categories"


label var aqtyp_pct_ma "Major alluvial (%)" 
label var aqtyp_pct_cx "Complex aquifer (%)" 
label var aqtyp_pct_kt  "Karstic (%)"
label var aqtyp_pct_ls "Local/shallow aquifer (%)"
label var aqtyp_pct_NA "No aquifer data (%)" 
label var resource_pct_grid_na "Not covered by country-lithological-outcrop resource data (% of grid cell)"
label var resource "GW resource, sum w/in cell (10^9 m3/yr)" 
label var resource_norm "GW resource, normalized (10^9 m3/yr)"


*using definitions in van der Schrier article
generate pdsi_cat_temp=recode(pdsi_av,-4,-3,-2,-1,-.5,.5,1,2,3,4,30)
gen pdsi_summer_cat_temp=recode(pdsi_summer,-4,-3,-2,-1,-.5,.5,1,2,3,4,30)
gen pdsi_min_cat_temp=recode(pdsi_summer,-4,-3,-2,-1,-.5,.5,1,2,3,4,30)

*matching distribution of cut points in remote sensed DSI
*_pctile pdsi_av, percentiles(4, 8, 15, 25, 36, 63, 75, 85, 92, 97)
*generate pdsi_cat_temp=recode(pdsi_av,`r(r1)',`r(r2)',`r(r3)',`r(r4)',`r(r5)',`r(r6)',`r(r7)',`r(r8)',`r(r9)',`r(r10)',30)
egen byte pdsi_cat=group(pdsi_cat_temp)
egen byte pdsi_summer_cat=group(pdsi_summer_cat_temp)
egen byte pdsi_min_cat=group(pdsi_min_cat_temp)

drop pdsi_cat_*temp 

label var pdsi_cat "sc-PDSI categories"
label var pdsi_summer_cat "sc-PDSI categories in June or Dec"
label var pdsi_min_cat "sc-PDSI categories at annual min"

label define pcat 1 "Extremely dry"  2 "Severely dry" 3 "Moderately dry" 4 "Mildly dry" ///
5 "Incipient drought" 6 "Near normal" 7 "Incipient wet spell"  8 "Slightly wet" 9 "Moderately wet" ///
10 "Very wet"  11 "Extremely wet"
label values pdsi_cat pcat
label values pdsi_summer_cat pcat
label values pdsi_min_cat pcat

label define contlabel 1 "AF" 2 "AS" 3 "AU" 4 "EU" 5 "NA" 6 "SA"

label value continent contlabel

compress

label data "Data at the Admin 1 level"
cd "../code" 

save for_reg_admin1, replace
timer off 1
timer list 1
log close

timer clear 1